Overview of data

#str(df)

# clean dataframe to keep useful columns  

tidy_df <- df %>% 
  dplyr::select(year, cause_name, state, deaths, age_adjusted_death_rate) %>%   
  mutate(cause = as.factor(cause_name),
         state = as.factor(state),
         rate = age_adjusted_death_rate/100) %>%  #rate is in decimal value 
  dplyr::select(year, state, cause, deaths, rate) %>% 
  
  # filter out "all causes"
  filter(cause != "All causes") %>% 
  
  # replace "CLRD" with its whole name
  mutate(cause = replace(as.character(cause), 
                         cause == "CLRD", 
                         "Chronic lower respiratory diseases"))

head(tidy_df)
## # A tibble: 6 x 5
##    year state      cause                  deaths  rate
##   <int> <fct>      <chr>                   <int> <dbl>
## 1  2016 Alabama    Unintentional injuries   2755 0.555
## 2  2016 Alaska     Unintentional injuries    439 0.631
## 3  2016 Arizona    Unintentional injuries   4010 0.542
## 4  2016 Arkansas   Unintentional injuries   1604 0.518
## 5  2016 California Unintentional injuries  13213 0.32 
## 6  2016 Colorado   Unintentional injuries   2880 0.512
skimr::skim(tidy_df)
## Skim summary statistics
##  n obs: 9360 
##  n variables: 5 
## 
## ── Variable type:character ───────────────────────────────────────────────────────────────
##  variable missing complete    n min max empty n_unique
##     cause       0     9360 9360   6  34     0       10
## 
## ── Variable type:factor ──────────────────────────────────────────────────────────────────
##  variable missing complete    n n_unique
##     state       0     9360 9360       52
##                              top_counts ordered
##  Ala: 180, Ala: 180, Ari: 180, Ark: 180   FALSE
## 
## ── Variable type:integer ─────────────────────────────────────────────────────────────────
##  variable missing complete    n    mean       sd   p0  p25    p50    p75
##    deaths       0     9360 9360 7307.49 39644.23   21  547 1440.5 3431.5
##      year       0     9360 9360 2007.5      5.19 1999 2003 2007.5 2012  
##    p100     hist
##  725192 ▇▁▁▁▁▁▁▁
##    2016 ▇▅▅▅▅▅▅▇
## 
## ── Variable type:numeric ─────────────────────────────────────────────────────────────────
##  variable missing complete    n mean   sd    p0  p25  p50  p75 p100
##      rate       0     9360 9360 0.61 0.68 0.026 0.18 0.31 0.55 3.47
##      hist
##  ▇▂▁▁▁▁▁▁

Summary of data:

  1. No missing data.
  2. There are 10 cuases of death, including Unintentional injuries, Alzheimer’s disease, Cancer, Chronic lower respiratory diseases, Diabetes, Heart disease, Influenza and pneumonia, Kidney disease, Stroke, Suicide
  3. There are 52 states.
  4. Dataframe has data from 1999 to 2016 (18 years).

Plot 1

The first plot is intended to deliver a message to general population and is trying to answer what leading death causes are across years.

plot1_v1 <- tidy_df %>% 
  
  # calculate yearly total deaths from all states
  group_by(year, cause) %>% 
  summarise(deaths_by_year_cause = sum(deaths)) %>%
  
  # plot
  ggplot(aes(x = fct_reorder(cause, deaths_by_year_cause), 
             y = deaths_by_year_cause/1000000)) +
    geom_col(fill = "steelblue",
             alpha = 0.7) +
    scale_y_continuous(expand = c(0, 0)) +
    coord_flip() +
    facet_wrap(~year, ncol = 3) +
    theme_minimal(base_size = 20) +
    theme(panel.grid.minor = element_blank(),
          plot.title = element_text(face = "bold")) +
    labs(y = "No. of deaths (in millions)",
         x = "Causes of deaths",
         title = "Heart disease is a leading cause of death over years",
         caption = "Source: Centers for Disease Control and Prevention")
         

plot1_v1

What can be improved?

Too many facets. A total of 18 facets make it hard to compare causes of death across years.

Plot 1: version 2

plot1_v2 <- tidy_df %>% 
  group_by(year, cause) %>% 
  summarise(deaths_by_year_cause = sum(deaths)) %>%
  
  # plot
  ggplot(aes(x = fct_reorder(cause, deaths_by_year_cause), 
             y = deaths_by_year_cause/1000000)) +
    geom_col(fill = "steelblue",
             alpha = 0.7) +
    scale_y_continuous(expand = c(0, 0)) +
    coord_flip() +
    theme_minimal(base_size = 15) +
    theme(panel.grid.minor = element_blank(),
          plot.margin = margin(1, 0.5, 1, 0.5, "cm")) +
    transition_time(year) +
    labs(title = "Top 10 Causes of Deaths",
         subtitle = "Year: {round(frame_time)}",
         caption = "Source: Centers for Disease Control and Prevention",
         y = "No. of deaths (in millions)",
         x = "Causes of deaths")


animate(plot1_v2, duration = 25,  
                  nframes = 100, 
                  end_pause = 10,
                  renderer = gifski_renderer(width = 1000))

What can be improved?

Rank of causes changes. fct_reorder cannot reflect an accurate rank of causes on y-axis.

Plot 1: version 3

plot1_v3 <- tidy_df %>% 
  group_by(year, cause) %>% 
  summarise(deaths_by_year_cause = sum(deaths)) %>% 

  # creat rank per year
  group_by(year) %>% 
  arrange(desc(deaths_by_year_cause)) %>% 
  mutate(rank = row_number()) %>% 
  arrange(year) %>% 
  ungroup() %>% 
  
  #plot
  ggplot(aes(x = '{frame_time}', y = rank)) +
  geom_text(aes(label = cause, 
                color = cause,
                group = cause),
            size = 8) +
  scale_x_discrete(labels = NULL) +
  scale_y_reverse() +
  scale_color_paletteer_d(rcartocolor, Vivid) +
  transition_states(year, 
                    transition_length = 1,
                    state_length = 3) +
  ease_aes('sine-in-out') +
  theme_void(base_size = 28) +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = "Rank of causes of death: {closest_state}") +
  guides(color = "none")
  
  

animate(plot1_v3, duration = 30, nframes = 300,
        renderer = gifski_renderer(width = 1200))

Something to be improved (in progress)

I am trying to combine the feasures of version 1 and version 2 to plot bra graph animation with changes in y axis accroding to the rank of cuases.

Plot 2

The second plot is for policy-maker or health-related research and is a quick summary plot regarding changes in death causes over years.

# create a new df so annotation can be added 

df_plot2 <- tidy_df %>% 
  group_by(year, cause) %>% 
  summarise(deaths_by_year_cause = sum(deaths))  
  

df_plot2 %>% 
  ggplot(aes(x = year, y = deaths_by_year_cause/1000000, color = cause)) +
    geom_line(size = 2) +
    scale_x_continuous(breaks = seq(1999, 2016, by = 2),
                       expand = c(0, 0)) +
    scale_y_log10(expand = c(0, 0),
                  breaks = c(0.1, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5)) + 
    scale_color_paletteer_d(rcartocolor, Vivid) +
    theme_minimal(base_size = 30) +
    theme(panel.grid.minor = element_blank()) +
    geom_text(data = filter(df_plot2, year == 2016),
              aes(label = cause),
              nudge_x = 3,
              hjust = 1.5,
              size = 8) +
    guides(color = "none") 

What to be improved?

Annotations of cause seems really hard to be ajusted/organized in a neat way. I failed to get a perfect annotation by ajusting fig.height, fig.width, or nudge_x.

Plot 2 version 2

tidy_df %>% 
  group_by(year, cause) %>% 
  summarise(deaths_by_year_cause = sum(deaths)) %>% 
  mutate(cause = factor(cause, 
                        levels = c("Heart disease", "Cancer", "Unintentional injuries",
                                   "Chronic lower respiratory diseases", "Stroke", "Alzheimer's disease", "Diabetes", "Influenza and pneumonia", "Kidney disease" , "Suicide"  
                                   ))) %>% 
  ggplot(aes(x = year, y = deaths_by_year_cause/1000000, color = cause)) +
    geom_line(size = 2) +
    scale_x_continuous(breaks = seq(1999, 2016, by = 2),
                       expand = c(0, 0)) +
    scale_y_log10(expand = c(0, 0),
                  breaks = c(0.1, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5)) + 
    scale_color_paletteer_d(rcartocolor, Vivid) +
    theme_minimal(base_size = 30) +
    theme(panel.grid.minor = element_blank(),
          legend.key.size = unit(3, 'lines'),
          plot.title = element_text(face = "bold"),
          plot.subtitle = element_text(face = "italic")) +
    labs(title = "Caues of Death Across Years",
         subtitle = "Heart disease and cancer are top 2 causes of deaths.\nAlzheimer's disease increases rapdly.",
         x = "Year",
         y = "No. of deaths (in millions)",
         caption = "Source: Centers for Disease Control and Prevention ",
         color = "")

Plot 3

The last plot is for both parties and is for displaying a distinct death causes in each state.

usa <- as_tibble(map_data("state"))
usa$region <- str_to_title(usa$region)
usa <- usa %>%
  rename(state = region)



distinct_by_state <- tidy_df %>% 
  
  # creat rank per year and state
  group_by(year, state) %>% 
  arrange(desc(deaths)) %>% 
  mutate(rank = row_number()) %>% 
  arrange(year) %>% 
  ungroup() %>% 
  
  # only select rank no. 1
  filter(rank == 1,
         state != "United States")


distinct_map <- full_join(usa, distinct_by_state, by = "state") %>% 
  filter(!is.na(lat), !is.na(long), !is.na(year))
## Warning: Column `state` joining character vector and factor, coercing into
## character vector
# map data with USA states

plot3 <- distinct_map  %>% 
  mutate(cause = factor(cause, levels = c("Heart disease", "Cancer"))) %>% 
  ggplot(aes(long, lat, group = group, fill = cause))+
  geom_polygon(color = "white")+
  coord_map()+
  labs(title = "A Distinct Cause of Death",
       subtitle = "Cancer emerges as a leading cause in early 20th.",
       caption = "Source: Centers for Disease Control and Prevention ",
       fill = "")+
  theme_void(base_size = 35) +
  facet_wrap(~year, ncol = 3) +
  theme(plot.title = element_text(face = "bold",
                                  hjust = 0.5,
                                  vjust = 10),
        plot.subtitle = element_text(face = "italic",
                                     hjust = 0.5,
                                     vjust = 10),
        legend.position = "top")

plot3

Something to be improved (in progress)

I tried to animate the above graph but failed to make the transisition smmothly. Debugin now.

ani_plot3 <- distinct_map  %>% 
  mutate(cause = factor(cause, levels = c("Heart disease", "Cancer"))) %>% 
  ggplot(aes(long, lat, group = group, fill = cause))+
  geom_polygon(color = "white")+
  coord_map()+
  theme_void() +
  transition_time(year) +
  labs(title = "A Distinct Cause of Death\nCancer emerges as a leading cause in early 20th.",
       subtitle = "Year: {round(frame_time)}",
       caption = "Source: Centers for Disease Control and Prevention ",
       fill = "")



animate(ani_plot3, duration = 30, renderer = gifski_renderer(width = 1200))